{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# originally from https://github.com/brightmart/nlp_chinese_corpus\n",
    "# !unzip wiki_zh_2019.zip\n",
    "# !unzip new2016zh.zip\n",
    "# !rm wiki_zh_2019.zip new2016zh.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "from tqdm import tqdm\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n"
     ]
    }
   ],
   "source": [
    "import malaya\n",
    "# Load Malaya's fastText language-detection model; its `predict` is used below to filter text chunks.\n",
    "language_detection = malaya.language_detection.fasttext(model = 'mesolitica/fasttext-language-detection-v1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# glob() returns paths in arbitrary, filesystem-dependent order; sort them so the\n",
    "# wiki shards are processed (and written out) in the same order on every run.\n",
    "wiki = sorted(glob('wiki_zh/*/*'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "def slide(text, n = 1024):\n",
    "    return [text[i: i + n] for i in range(0, len(text), n)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2430752 news2016zh_train.json\r\n"
     ]
    }
   ],
   "source": [
    "# Count the lines of the news dump; the extraction cell below reads it one JSON record per line.\n",
    "!wc -l news2016zh_train.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2430752it [03:29, 11578.41it/s]\n",
      "100%|███████████████████████████████████████| 1274/1274 [00:42<00:00, 30.06it/s]\n"
     ]
    }
   ],
   "source": [
    "def _write_other_chunks(lines, key, out):\n",
    "    \"\"\"Write every chunk of `record[key]` that the detector labels 'other' to `out`.\n",
    "\n",
    "    lines: iterable of JSON-encoded records, one per line.\n",
    "    key: name of the record field holding the text ('content' for news, 'text' for wiki).\n",
    "    out: writable text file; each kept chunk is written as one JSON string per line.\n",
    "    \"\"\"\n",
    "    for line in lines:\n",
    "        # A blank line (e.g. a stray trailing newline) is not a record; skip it rather\n",
    "        # than letting json.loads abort the whole multi-minute run.\n",
    "        if not line.strip():\n",
    "            continue\n",
    "\n",
    "        record = json.loads(line)\n",
    "        for chunk in slide(record[key]):\n",
    "            if not chunk.strip():\n",
    "                continue\n",
    "\n",
    "            # Keep only chunks labelled 'other'. NOTE(review): presumably Mandarin falls\n",
    "            # under 'other' for this v1 detector, given the output file name - confirm.\n",
    "            if language_detection.predict([chunk])[0] != 'other':\n",
    "                continue\n",
    "\n",
    "            out.write(f'{json.dumps(chunk)}\\n')\n",
    "\n",
    "\n",
    "# Explicit UTF-8 so the Chinese corpora decode the same way regardless of the\n",
    "# platform's default locale encoding.\n",
    "with open('prepare-standard-mandarin.jsonl', 'w', encoding = 'utf-8') as fopen_l:\n",
    "    with open('news2016zh_train.json', encoding = 'utf-8') as fopen:\n",
    "        _write_other_chunks(tqdm(fopen), 'content', fopen_l)\n",
    "\n",
    "    for f in tqdm(wiki):\n",
    "        with open(f, encoding = 'utf-8') as fopen:\n",
    "            _write_other_chunks(fopen, 'text', fopen_l)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
